

*********************** LOAD DATA FOR ANALYSIS ***********************************

* Start from an empty dataset, then read the analysis file.
* varnames(1): take variable names from the first row of the file.
* case(preserve): keep the capitalisation of those names as written.
clear
import delimited using LifeScience_manuscript, varnames(1) case(preserve)

* LifeScience_manuscript.csv – data for replication of results presented in main manuscript
* LifeScience_noalpha.csv – data for replication of results presented in supplement: "Excl. articles with alphabetical authorship"
* LifeScience_allgender.csv – data for replication of results presented in supplement: "Excl. articles with missing gender of authors"
* LifeScience_allauthorships.csv – data for replication of results presented in supplement: "Incl. articles with single and dual a'ships"
* LifeScience_85sex.csv – data for replication of results presented in supplement: "Sex designation threshold of 85% confidence"
* LifeScience_95sex.csv – data for replication of results presented in supplement: "Sex designation threshold of 95% confidence"

**********************************************************************************


* Build 0/1 indicator variables for every observation (one observation = one authorship).

* Every row counts as one authorship.
generate a_ship = 1

* Flag exactly one row per pmid (the first row after sorting by author_order)
* so that summing nvals later counts distinct articles.
by pmid (author_order), sort: generate nvals = (_n == 1)
tabulate nvals

* Sex-specific indicators. The code below treats sex == 1 as female and sex == 0 as male;
* any other value of sex (including missing) yields 0 in every indicator.
*   <g>_aship : authorship held by sex <g>
*   first_<g> : first author (author_order == 1) of sex <g>
*   last_<g>  : last author (author_order == n_authors) of sex <g>
*   inter_<g> : intermediate author (strictly between first and last) of sex <g>
foreach g in female male {
    local s = cond("`g'" == "female", 1, 0)
    generate `g'_aship = (sex == `s')
    generate first_`g' = (author_order == 1) & (sex == `s')
    generate last_`g' = (author_order == n_authors) & (sex == `s')
    generate inter_`g' = (author_order > 1) & (author_order < n_authors) & (sex == `s')
}

* The year variable is referred to as j_date_year in the rest of the script.
rename link_year j_date_year

***		CREATE RR STATISTICS BY DECADE

* Step 1: yearly totals. pmids sums the one-row-per-article flag (number of articles);
* a_ships sums the per-row constant (number of authorships).
collapse (sum) pmids = nvals a_ships = a_ship ///
    first_female last_female female_aship ///
    first_male last_male male_aship ///
    inter_female inter_male, ///
    by(j_date_year)

* Step 2: assign each year to a period:
*   1 = before 1995, 2 = 1995-2004, 3 = everything else
*   (2005 onwards; a missing year also stays in 3 because missing is not < 2005).
generate decade = 3
replace decade = 2 if j_date_year < 2005
replace decade = 1 if j_date_year < 1995

* Step 3: add the yearly totals up within each period.
collapse (sum) pmids a_ships ///
    first_female last_female female_aship ///
    first_male last_male male_aship ///
    inter_female inter_male, ///
    by(decade)

* Rates of prestigious authorship, computed separately for each sex.
* Last authorship: last-author count divided by ALL authorships of that sex.
generate cprob_lastm = last_male / male_aship
generate cprob_lastf = last_female / female_aship

* First authorship: first-author count divided by first + intermediate authorships
* of that sex (last authorships are not part of this denominator).
generate cprob_firstf = first_female / (first_female + inter_female)
generate cprob_firstm = first_male / (first_male + inter_male)

* Relative risk (female vs. male) of last authorship, with confidence limits.
generate ratio_fm_last = cprob_lastf / cprob_lastm

* Adjusted proportions: 0.5 is added to each count and 1 to each denominator.
generate p1 = (last_female + 0.5) / (female_aship + 1)
generate p2 = (last_male + 0.5) / (male_aship + 1)

* Standard error on the log scale: sqrt((1-p1)/(n1*p1) + (1-p2)/(n2*p2)),
* with n1, n2 the female and male authorship totals. Held in double precision
* so the limits below are computed from the unrounded value.
generate double se_lnrr = (((1 - p1)/(female_aship*p1)) + ((1 - p2)/(male_aship*p2)))^0.5

* Limits are p1/p2 multiplied by exp(+/- 1.96 * SE).
generate CI_U_last = p1/p2*exp(1.96*se_lnrr)
generate CI_L_last = p1/p2*exp(-1.96*se_lnrr)

* Remove the working variables.
drop p1 p2 se_lnrr

* Relative risk (female vs. male) of first authorship, with confidence limits.
* Denominators are first + intermediate authorships of each sex.
generate ratio_fm_first = cprob_firstf / cprob_firstm

* Adjusted proportions: 0.5 is added to each count and 1 to each denominator.
generate p1 = (first_female + 0.5) / (first_female + inter_female + 1)
generate p2 = (first_male + 0.5) / (first_male + inter_male + 1)

* Standard error on the log scale: sqrt((1-p1)/(n1*p1) + (1-p2)/(n2*p2)),
* with n1 = first_female + inter_female and n2 = first_male + inter_male.
* FIX: the male term previously divided by n2 only (the "*p2" factor was missing),
* which understated the standard error and made the interval too narrow.
generate double se_lnrr = (((1 - p1)/((first_female + inter_female)*p1)) + ((1 - p2)/((first_male + inter_male)*p2)))^0.5

* Limits are p1/p2 multiplied by exp(+/- 1.96 * SE).
generate CI_U_first = p1/p2*exp(1.96*se_lnrr)
generate CI_L_first = p1/p2*exp(-1.96*se_lnrr)

* Remove the working variables.
drop p1 p2 se_lnrr

